Adatok olvasása és statisztika a két mérés adatain

Nem azt méri -> szakmailag nem jó

Nem jól kitölthető

Az eloszlás azonos -> kitöltsé változása azonos

Személyenként a tesztek távolsága (Hemming, cos, euklideszi,...)

In [1]:
%matplotlib inline
import pandas as pd 
import matplotlib.pyplot as plt
file_1 = r"shhs1-dataset-0.14.0.csv"
file_2 = r"shhs2-dataset-0.14.0.csv"
file_3 = r"shhs-interim-followup-dataset-0.14.0.csv"
shhs1 = pd.read_csv(file_1, engine='python')
shhs2 = pd.read_csv(file_2, engine='python')
shhs_inter = pd.read_csv(file_3, engine='python')
In [2]:
shhs = shhs1.merge(shhs2, on="nsrrid")
shhs = shhs.merge(shhs_inter, on="nsrrid")
shhs.columns = map(str.lower, shhs.columns)
shhs
Out[2]:
nsrrid pptid_x ecgdate_x lvh3_1_x lvh3_3_x st4_1_3_x st5_1_3_x lvhst_x mob1_x part2deg_x ... chfdt cabgptcadt carotidenddt completeddt_stat readin_stat ess_interim visitnumber gender race age_s1
0 200077 77 114.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN 737.0 749.0 4.0 4 1 1 41
1 200078 78 -375.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 ... NaN NaN NaN 809.0 815.0 3.0 4 1 1 54
2 200079 79 356.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN 809.0 815.0 9.0 4 2 3 56
3 200080 80 -397.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 ... NaN NaN NaN 777.0 786.0 12.0 4 1 1 54
4 200081 81 2.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 ... NaN NaN NaN 573.0 579.0 1.0 4 2 1 40
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
4075 205798 5833 -704.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... NaN NaN NaN 580.0 592.0 12.0 4 1 1 59
4076 205799 5834 -907.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... NaN NaN NaN 545.0 557.0 12.0 4 2 1 54
4077 205800 5835 -854.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... NaN NaN NaN 553.0 561.0 5.0 4 1 1 66
4078 205801 5836 -755.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... NaN NaN NaN 632.0 645.0 11.0 4 1 1 54
4079 205802 5837 -768.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... NaN NaN NaN 603.0 616.0 18.0 4 1 1 55

4080 rows × 2683 columns

In [3]:
shhs.ess_s1.hist(bins = 24)
Out[3]:
<AxesSubplot:>
In [4]:
shhs.ess_interim.hist(bins = 24)
Out[4]:
<AxesSubplot:>
In [5]:
shhs.ess_s2.hist(bins = 24)
Out[5]:
<AxesSubplot:>

Weibull

In [6]:
import matplotlib.pyplot as plt
import numpy as np
import scipy
import scipy.stats
size = 30000
x = np.arange(np.nan_to_num(shhs.ess_s2).shape[0])
y = shhs.ess_s1 #scipy.int_(np.round_(scipy.stats.vonmises.rvs(5,size=size)*47))
y = y[np.logical_not(np.isnan(y))]
h = plt.hist(y, bins=range(48))

dist_names = ['gamma', 'beta', 'rayleigh', 'norm', 'pareto', 'loggamma']

for dist_name in dist_names:
    dist = getattr(scipy.stats, dist_name)
    params = dist.fit(y)
    print(dist_name)
    print(params)
    
    arg = params[:-2]
    loc = params[-2]
    scale = params[-1]
    if arg:
        pdf_fitted = dist.pdf(x, *arg, loc=loc, scale=scale) * size
    else:
        pdf_fitted = dist.pdf(x, loc=loc, scale=scale) * size
    plt.plot(pdf_fitted, label=dist_name)
    plt.xlim(0,47)
plt.legend(loc='upper right')
plt.show()
gamma
(5.073053561926454, -2.2073209650998042, 1.9504017073256645)
beta
(2.7453967765987652, 8.855531017668572, -0.8596464927558829, 36.11947143211091)
rayleigh
(-0.5297185124041954, 6.566688217027271)
norm
(7.687169145449962, 4.327259749206995)
C:\ProgramData\Anaconda3\lib\site-packages\scipy\stats\_continuous_distns.py:621: RuntimeWarning: invalid value encountered in sqrt
  sk = 2*(b-a)*np.sqrt(a + b + 1) / (a + b + 2) / np.sqrt(a*b)
C:\ProgramData\Anaconda3\lib\site-packages\scipy\stats\_distn_infrastructure.py:2429: RuntimeWarning: invalid value encountered in double_scalars
  Lhat = muhat - Shat*mu
pareto
(123739587.48476316, -950333913.3333936, 950333913.3333936)
loggamma
(1919.6282535073897, -1455.288320546395, 193.52436254057034)
In [7]:
from scipy.stats import beta
#mean, var, skew, kurt =beta.stats(2.7453967765987652, 8.855531017668572, moments='mvsk')
In [8]:
import matplotlib.pyplot as plt
import numpy as np
import scipy
import scipy.stats
size = 30000
x = np.arange(np.nan_to_num(shhs.ess_interim).shape[0])
y = shhs.ess_interim
y = y[np.logical_not(np.isnan(y))] #scipy.int_(np.round_(scipy.stats.vonmises.rvs(5,size=size)*47))
h = plt.hist(y, bins=range(48))

dist_names = ['gamma', 'rayleigh', 'norm', 'pareto', 'loggamma']

for dist_name in dist_names:
    dist = getattr(scipy.stats, dist_name)
    params = dist.fit(y)
    print(dist_name)
    print(params)
    
    arg = params[:-2]
    loc = params[-2]
    scale = params[-1]
    if arg:
        pdf_fitted = dist.pdf(x, *arg, loc=loc, scale=scale) * size
    else:
        pdf_fitted = dist.pdf(x, loc=loc, scale=scale) * size
    plt.plot(pdf_fitted, label=dist_name)
    plt.xlim(0,47)
plt.legend(loc='upper right')
plt.show()
gamma
(4.227121503015497, -1.6183094638443076, 2.0748916611474844)
rayleigh
(-0.6114133922509992, 6.241979594019017)
norm
(7.152501985702939, 4.200689545128268)
pareto
(81852702.89122652, -477678162.1998472, 477678162.1998472)
C:\ProgramData\Anaconda3\lib\site-packages\scipy\stats\_distn_infrastructure.py:2429: RuntimeWarning: invalid value encountered in double_scalars
  Lhat = muhat - Shat*mu
loggamma
(3634.675616503091, -2072.501970012594, 253.67189003238593)
In [9]:
import matplotlib.pyplot as plt
import numpy as np
import scipy
import scipy.stats
size = 30000
x = np.arange(np.nan_to_num(shhs.ess_s2).shape[0])
y = shhs.ess_s2 #scipy.int_(np.round_(scipy.stats.vonmises.rvs(5,size=size)*47))
y = y[np.logical_not(np.isnan(y))]
h = plt.hist(y, bins=range(48))

dist_names = ['gamma', 'beta', 'rayleigh', 'norm', 'pareto', 'loggamma']

for dist_name in dist_names:
    dist = getattr(scipy.stats, dist_name)
    params = dist.fit(y)
    print(dist_name)
    print(params)
    
    arg = params[:-2]
    loc = params[-2]
    scale = params[-1]
    if arg:
        pdf_fitted = dist.pdf(x, *arg, loc=loc, scale=scale) * size
    else:
        pdf_fitted = dist.pdf(x, loc=loc, scale=scale) * size
    plt.plot(pdf_fitted, label=dist_name)
    plt.xlim(0,47)
plt.legend(loc='upper right')
plt.show()
gamma
(4.880758264917269, -2.126760697841364, 1.928101967994961)
beta
(2.7002607142601676, 9.231922903134295, -0.8812785340439031, 36.07888008713343)
rayleigh
(-0.6145689117248834, 6.322756453486665)
norm
(7.283841693975297, 4.191544246371809)
C:\ProgramData\Anaconda3\lib\site-packages\scipy\stats\_continuous_distns.py:621: RuntimeWarning: invalid value encountered in sqrt
  sk = 2*(b-a)*np.sqrt(a + b + 1) / (a + b + 2) / np.sqrt(a*b)
C:\ProgramData\Anaconda3\lib\site-packages\scipy\stats\_distn_infrastructure.py:2429: RuntimeWarning: invalid value encountered in double_scalars
  Lhat = muhat - Shat*mu
pareto
(85691746.46722403, -594820000.63553, 594820000.63553)
loggamma
(1816.436934816431, -1343.4640367373322, 179.9941657318344)

Wilcoxon

In [10]:
from scipy import stats
#w, p = 
stats.wilcoxon(shhs.ess_s1, shhs.ess_s2)
Out[10]:
WilcoxonResult(statistic=2338820.5, pvalue=5.6910857139563165e-37)
In [11]:
stats.wilcoxon(shhs.ess_interim, shhs.ess_s2)
Out[11]:
WilcoxonResult(statistic=2192366.0, pvalue=7.071771583285873e-43)
In [12]:
stats.wilcoxon(shhs.ess_s1, shhs.ess_interim)
Out[12]:
WilcoxonResult(statistic=1890632.0, pvalue=1.049603260873792e-87)

Kolmogorov-Smirnov

In [13]:
stats.ks_2samp(shhs.ess_s1, shhs.ess_s2)
Out[13]:
KstestResult(statistic=0.03848039215686275, pvalue=0.004752780852290926)
In [14]:
stats.ks_2samp(shhs.ess_interim, shhs.ess_s1)
Out[14]:
KstestResult(statistic=0.047058823529411764, pvalue=0.00023772968264865603)
In [15]:
stats.ks_2samp(shhs.ess_interim, shhs.ess_s2)
Out[15]:
KstestResult(statistic=0.04877450980392157, pvalue=0.0001214945023617946)

Úgy tűnik nincs eltérés a két mérés eloszlása között.

Mérések közti eltérések vizsgálata

In [16]:
import plotly.express as px
fig = px.density_heatmap(shhs, x="ess_s1", y="ess_s2")
fig.show()
In [17]:
fig = px.density_heatmap(shhs, x="ess_s2", y="ess_interim")
fig.show()
In [18]:
import plotly.express as px
fig = px.scatter_3d(shhs.loc[shhs.mdsa02>0,:], x="ess_s1", y="ess_s2", z="ess_interim", color = "mdsa02", opacity=0.3)
fig.show()
In [19]:
import numpy as np

np.nanmean(shhs.ess_s1.values-shhs.ess_s2.values)
Out[19]:
0.3965829666062646
In [20]:
np.nanmean(shhs.ess_s1.values-shhs.ess_interim.values)
Out[20]:
0.5203915171288744
In [21]:
np.nanmean(-shhs.ess_s2.values+shhs.ess_interim.values)
Out[21]:
-0.12248230811105063
In [22]:
plt.hist(shhs.ess_s1.values-shhs.ess_s2.values)
Out[22]:
(array([2.000e+00, 1.000e+00, 8.000e+00, 9.000e+01, 5.800e+02, 1.855e+03,
        1.165e+03, 1.370e+02, 2.100e+01, 4.000e+00]),
 array([-24. , -19.7, -15.4, -11.1,  -6.8,  -2.5,   1.8,   6.1,  10.4,
         14.7,  19. ]),
 <BarContainer object of 10 artists>)

Concordance correlation coefficient

In [23]:
import numpy as np

def concordance_correlation_coefficient(y_true, y_pred,
                       sample_weight=None,
                       multioutput='uniform_average'):
    """Concordance correlation coefficient.
    The concordance correlation coefficient is a measure of inter-rater agreement.
    It measures the deviation of the relationship between predicted and true values
    from the 45 degree angle.
    Read more: https://en.wikipedia.org/wiki/Concordance_correlation_coefficient
    Original paper: Lawrence, I., and Kuei Lin. "A concordance correlation coefficient to evaluate reproducibility." Biometrics (1989): 255-268.  
    Parameters
    ----------
    y_true : array-like of shape = (n_samples) or (n_samples, n_outputs)
        Ground truth (correct) target values.
    y_pred : array-like of shape = (n_samples) or (n_samples, n_outputs)
        Estimated target values.
    Returns
    -------
    loss : A float in the range [-1,1]. A value of 1 indicates perfect agreement
    between the true and the predicted values.
    Examples
    --------
    >>> from sklearn.metrics import concordance_correlation_coefficient
    >>> y_true = [3, -0.5, 2, 7]
    >>> y_pred = [2.5, 0.0, 2, 8]
    >>> concordance_correlation_coefficient(y_true, y_pred)
    0.97678916827853024
    """
    cor=np.corrcoef(y_true,y_pred)[0][1]
    
    mean_true=np.mean(y_true)
    mean_pred=np.mean(y_pred)
    
    var_true=np.var(y_true)
    var_pred=np.var(y_pred)
    
    sd_true=np.std(y_true)
    sd_pred=np.std(y_pred)
    
    numerator=2*cor*sd_true*sd_pred
    
    denominator=var_true+var_pred+(mean_true-mean_pred)**2

    return numerator/denominator

n_samples=1000
y_true = np.arange(n_samples)
y_pred = y_true + 5
plt.scatter(shhs.ess_s2, shhs.ess_interim)
Out[23]:
<matplotlib.collections.PathCollection at 0x17c71c169e8>
In [24]:
concordance_correlation_coefficient(np.nan_to_num(shhs.ess_s1), np.nan_to_num(shhs.ess_s2))
Out[24]:
0.5824460983267877
In [25]:
concordance_correlation_coefficient(np.nan_to_num(shhs.ess_s1), np.nan_to_num(shhs.ess_interim))
Out[25]:
0.5788553018519524
In [26]:
concordance_correlation_coefficient(np.nan_to_num(shhs.ess_s2), np.nan_to_num(shhs.ess_interim))
Out[26]:
0.5595391936678727

A cikkben ez szerepelt: "The Rc value for Lin's concordance coefficient was 0.748 (the correlation is poor if the value is less than 0.9)."

Odds ratio

In [27]:
shhs
Out[27]:
nsrrid pptid_x ecgdate_x lvh3_1_x lvh3_3_x st4_1_3_x st5_1_3_x lvhst_x mob1_x part2deg_x ... chfdt cabgptcadt carotidenddt completeddt_stat readin_stat ess_interim visitnumber gender race age_s1
0 200077 77 114.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN 737.0 749.0 4.0 4 1 1 41
1 200078 78 -375.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 ... NaN NaN NaN 809.0 815.0 3.0 4 1 1 54
2 200079 79 356.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN 809.0 815.0 9.0 4 2 3 56
3 200080 80 -397.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 ... NaN NaN NaN 777.0 786.0 12.0 4 1 1 54
4 200081 81 2.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 ... NaN NaN NaN 573.0 579.0 1.0 4 2 1 40
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
4075 205798 5833 -704.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... NaN NaN NaN 580.0 592.0 12.0 4 1 1 59
4076 205799 5834 -907.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... NaN NaN NaN 545.0 557.0 12.0 4 2 1 54
4077 205800 5835 -854.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... NaN NaN NaN 553.0 561.0 5.0 4 1 1 66
4078 205801 5836 -755.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... NaN NaN NaN 632.0 645.0 11.0 4 1 1 54
4079 205802 5837 -768.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... NaN NaN NaN 603.0 616.0 18.0 4 1 1 55

4080 rows × 2683 columns

Az egyes mutatók változásai

In [28]:
shhs_interim_ess = shhs.loc[:,["sitread","watchtv","sitpublic","ridecar","resting","sittalk","sitafterlunch","stoppedcar","atdinner", "whiledriving"]]
shhs_interim_ess
Out[28]:
sitread watchtv sitpublic ridecar resting sittalk sitafterlunch stoppedcar atdinner whiledriving
0 2.0 3.0 1.0 1.0 2.0 1.0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0 4.0 1.0 1.0 1.0 1.0 1.0
2 2.0 2.0 1.0 3.0 4.0 1.0 3.0 1.0 1.0 1.0
3 3.0 3.0 3.0 3.0 4.0 1.0 2.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0 2.0 1.0 1.0 1.0 1.0 1.0
... ... ... ... ... ... ... ... ... ... ...
4075 3.0 4.0 2.0 1.0 4.0 1.0 4.0 1.0 1.0 1.0
4076 4.0 4.0 1.0 1.0 4.0 1.0 4.0 1.0 1.0 1.0
4077 2.0 3.0 1.0 2.0 2.0 1.0 1.0 1.0 1.0 1.0
4078 3.0 4.0 2.0 2.0 4.0 1.0 2.0 1.0 1.0 1.0
4079 4.0 4.0 4.0 4.0 4.0 2.0 3.0 1.0 1.0 2.0

4080 rows × 10 columns

In [29]:
shhs_1_ess = shhs.loc[:,["sitrd02","watv02","sitpub02","pgrcar02","lydwn02","sittlk02","sitlch02","incar02","attabl02","drive02"]]
shhs_1_ess

#SitRd02	WaTV02	SitPub02	PgrCar02	LyDwn02	SitTlk02	SitLch02	InCar02	AtTabl02	Drive02
#sh319a	sh319b	sh319c	sh319d	sh319e	sh319f	sh319g	sh319h	sh319i	sh319j
#sitRead	watchTV	sitPublic	rideCar	resting	sitTalk	sitAfterLunch	stoppedCar	atDinner	whileDriving	bpmeds	aspirin	drive
#["sitrd02","watv02","sitpub02","pgrcar02","lydwn02","sittlk02","sitlch02","incar02","attabl02","drive02"]
#["sh319a","sh319b","sh319c","sh319d","sh319e","sh319f","sh319g","sh319h","sh319i","sh319j"]
#["sitread","watchtv","sitpublic","ridecar","resting","sittalk","sitafterLunch","stoppedcar","atdinner", "whiledriving"]
Out[29]:
sitrd02 watv02 sitpub02 pgrcar02 lydwn02 sittlk02 sitlch02 incar02 attabl02 drive02
0 2.0 3.0 2.0 2.0 2.0 1.0 1.0 1.0 1.0 1.0
1 1.0 2.0 1.0 2.0 4.0 1.0 2.0 1.0 1.0 1.0
2 1.0 1.0 1.0 3.0 3.0 1.0 1.0 1.0 1.0 1.0
3 4.0 2.0 3.0 3.0 4.0 1.0 2.0 1.0 1.0 1.0
4 1.0 1.0 2.0 1.0 3.0 1.0 1.0 1.0 1.0 1.0
... ... ... ... ... ... ... ... ... ... ...
4075 1.0 2.0 1.0 1.0 2.0 1.0 3.0 1.0 1.0 1.0
4076 2.0 4.0 1.0 1.0 4.0 1.0 1.0 1.0 1.0 1.0
4077 2.0 4.0 2.0 1.0 2.0 1.0 2.0 1.0 1.0 1.0
4078 3.0 4.0 2.0 3.0 4.0 1.0 2.0 1.0 1.0 1.0
4079 4.0 4.0 4.0 3.0 4.0 2.0 3.0 1.0 1.0 2.0

4080 rows × 10 columns

In [30]:
shhs_2_ess = shhs.loc[:,["sh319a","sh319b","sh319c","sh319d","sh319e","sh319f","sh319g","sh319h","sh319i","sh319j"]]
shhs_2_ess
Out[30]:
sh319a sh319b sh319c sh319d sh319e sh319f sh319g sh319h sh319i sh319j
0 4.0 4.0 2.0 1.0 2.0 1.0 1.0 1.0 1.0 1.0
1 2.0 2.0 1.0 2.0 4.0 1.0 2.0 1.0 1.0 1.0
2 2.0 3.0 1.0 4.0 4.0 1.0 2.0 1.0 1.0 1.0
3 4.0 3.0 3.0 3.0 4.0 1.0 2.0 1.0 1.0 1.0
4 1.0 2.0 1.0 1.0 3.0 1.0 1.0 1.0 1.0 1.0
... ... ... ... ... ... ... ... ... ... ...
4075 3.0 3.0 1.0 2.0 4.0 2.0 3.0 1.0 1.0 2.0
4076 4.0 4.0 1.0 1.0 4.0 1.0 4.0 1.0 1.0 1.0
4077 3.0 3.0 1.0 2.0 2.0 1.0 2.0 1.0 1.0 1.0
4078 2.0 4.0 1.0 2.0 2.0 1.0 2.0 1.0 1.0 1.0
4079 4.0 4.0 4.0 3.0 4.0 2.0 3.0 2.0 1.0 1.0

4080 rows × 10 columns

In [31]:
plt.bar(["sitread","watchtv","sitpublic","ridecar","resting","sittalk","sitafterlunch","stoppedcar","atdinner", "whiledriving"],np.nanmean(shhs_2_ess.values-shhs_1_ess.values, axis = 0))
plt.xticks(rotation=45, ha='right')
Out[31]:
([0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, '')])
In [32]:
plt.bar(["sitread","watchtv","sitpublic","ridecar","resting","sittalk","sitafterlunch","stoppedcar","atdinner", "whiledriving"],np.nanstd(shhs_2_ess.values-shhs_1_ess.values, axis = 0))
plt.xticks(rotation=45, ha='right')
Out[32]:
([0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, '')])
In [33]:
plt.bar(["sitread","watchtv","sitpublic","ridecar","resting","sittalk","sitafterlunch","stoppedcar","atdinner", "whiledriving"],np.nanmean(shhs_2_ess.values-shhs_interim_ess.values, axis = 0))
plt.xticks(rotation=45, ha='right')
Out[33]:
([0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, '')])
In [34]:
plt.bar(["sitread","watchtv","sitpublic","ridecar","resting","sittalk","sitafterlunch","stoppedcar","atdinner", "whiledriving"],np.nanstd(shhs_2_ess.values-shhs_interim_ess.values, axis = 0))
plt.xticks(rotation=45, ha='right')
Out[34]:
([0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, '')])
In [35]:
plt.bar(["sitread","watchtv","sitpublic","ridecar","resting","sittalk","sitafterlunch","stoppedcar","atdinner", "whiledriving"],np.nanmean(shhs_1_ess.values-shhs_interim_ess.values, axis = 0))
plt.xticks(rotation=45, ha='right')
Out[35]:
([0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, '')])
In [36]:
fig, axs = plt.subplots(10, figsize=(12, 23), sharey=True)
arr = ["sitread","watchtv","sitpublic","ridecar","resting","sittalk","sitafterlunch","stoppedcar","atdinner", "whiledriving"]
for i in range(10):
    axs[i].hist(shhs_2_ess.values[:,i]-shhs_1_ess.values[:,i], bins = 7)
    axs[i].set_title(arr[i])


plt.plot()
#plt.hist(shhs_2_ess.values[:,2]-shhs_1_ess.values[:,2], bins= 7)
Out[36]:
[]

Távolságok

Cos

In [37]:
import scipy.spatial as sp
cos_dist_m = 1-sp.distance.cdist(shhs_1_ess.values,shhs_interim_ess.values, 'cosine')
In [38]:
from scipy import spatial

cos_dist = []
euclidean_dist = []
mod_euclidean_dist = []
mink_dist = []

for i in range(shhs_1_ess.shape[0]):
    cos_dist.append(1 - spatial.distance.cosine(shhs_1_ess.values[i,:],shhs_interim_ess.values[i,:]))
    euclidean_dist.append(spatial.distance.euclidean(np.nan_to_num(shhs_1_ess.values[i,:]),np.nan_to_num(shhs_interim_ess.values[i,:])))
    mod_euclidean_dist.append(spatial.distance.euclidean(np.nan_to_num(shhs_1_ess.values[i,:])**3,np.nan_to_num(shhs_interim_ess.values[i,:])**3))
    mink_dist.append(spatial.distance.minkowski(np.nan_to_num(shhs_1_ess.values[i,:]),np.nan_to_num(shhs_interim_ess.values[i,:]),1))

print("Cos távolság:")    
print(np.nanmean(cos_dist))
print("\nEuklideszi távolság:")
print(np.nanmean(euclidean_dist))
print("\nBence féle módosított távolság:")
print(np.nanmean(mod_euclidean_dist))
print("\nMinkowski távolság:")
print(np.nanmean(mink_dist))
Cos távolság:
0.9552493495583483

Euklideszi távolság:
2.391918651395364

Bence féle módosított távolság:
40.471321229465424

Minkowski távolság:
4.912745098039216
In [39]:
plt.plot(np.array(mod_euclidean_dist)/10, alpha = 0.5)
plt.plot(euclidean_dist, alpha = 0.5)
Out[39]:
[<matplotlib.lines.Line2D at 0x17c71019d68>]

Lineáris regresszió

BMI + ESS + kor és RDI (espiratory disturbance index) kapcsolata

In [40]:
import numpy as np
from sklearn.linear_model import LinearRegression
X = np.nan_to_num(shhs.loc[:,["sitrd02","watv02","sitpub02","pgrcar02","lydwn02","sittlk02","sitlch02","incar02","attabl02","drive02", "age_s1", "bmi_s1"]].values)
# y = 1 * x_0 + 2 * x_1 + 3
y = np.nan_to_num(shhs.loc[:,['rdi0p_x']].values)
reg = LinearRegression().fit(X, y)
print(reg.score(X, y))

print(reg.coef_)

dict(zip(list(reg.coef_.flatten()), ["sitrd02","watv02","sitpub02","pgrcar02","lydwn02","sittlk02","sitlch02","incar02","attabl02","drive02", "age_s1", "bmi_s1"]))
0.11026019907963913
[[ 0.48739743  0.41460398  1.02845832 -1.35669726  0.10593086  0.96555673
   0.85838249  0.68690417 -1.47967041  1.74115315  0.27516322  1.01759849]]
Out[40]:
{0.4873974272424685: 'sitrd02',
 0.41460397638945823: 'watv02',
 1.0284583202648536: 'sitpub02',
 -1.3566972621189712: 'pgrcar02',
 0.10593086103595721: 'lydwn02',
 0.9655567311351835: 'sittlk02',
 0.8583824908129789: 'sitlch02',
 0.6869041703981125: 'incar02',
 -1.4796704102060994: 'attabl02',
 1.7411531490914134: 'drive02',
 0.2751632168124805: 'age_s1',
 1.0175984855450997: 'bmi_s1'}
In [41]:
X = np.nan_to_num(shhs.loc[:,["sh319a","sh319b","sh319c","sh319d","sh319e","sh319f","sh319g","sh319h","sh319i","sh319j", "age_s2", "bmi_s2"]].values)
# y = 1 * x_0 + 2 * x_1 + 3
y = np.nan_to_num(shhs.loc[:,['rdi0p_y']].values)
reg = LinearRegression().fit(X, y)
print(reg.score(X, y))

print(reg.coef_)

dict(zip(list(reg.coef_.flatten()), ["sitrd02","watv02","sitpub02","pgrcar02","lydwn02","sittlk02","sitlch02","incar02","attabl02","drive02", "age_s2", "bmi_s2"]))
0.13395572679186651
[[ 0.86423784  0.2473459   0.60348232 -0.0982708   0.961688   -0.17762223
   0.26377582 -1.52174295 -1.71346976  2.08210245  0.16314456  0.65863896]]
Out[41]:
{0.8642378383712407: 'sitrd02',
 0.24734589694100512: 'watv02',
 0.6034823246435668: 'sitpub02',
 -0.09827079913534006: 'pgrcar02',
 0.9616880047310314: 'lydwn02',
 -0.17762223286898232: 'sittlk02',
 0.26377582458674703: 'sitlch02',
 -1.5217429525999815: 'incar02',
 -1.7134697614769507: 'attabl02',
 2.082102449783908: 'drive02',
 0.1631445567303398: 'age_s2',
 0.6586389647869004: 'bmi_s2'}
In [42]:
plt.scatter(shhs.rdi0p_x, shhs.ess_s1)
Out[42]:
<matplotlib.collections.PathCollection at 0x17c7344fe48>
In [43]:
plt.hist(np.nanstd(shhs_2_ess.values-shhs_1_ess.values, axis = 1))
C:\ProgramData\Anaconda3\lib\site-packages\numpy\lib\nanfunctions.py:1665: RuntimeWarning:

Degrees of freedom <= 0 for slice.

Out[43]:
(array([ 181.,  427., 1422., 1031.,  611.,  221.,   78.,   23.,    4.,
           3.]),
 array([0.        , 0.19209373, 0.38418745, 0.57628118, 0.76837491,
        0.96046864, 1.15256236, 1.34465609, 1.53674982, 1.72884354,
        1.92093727]),
 <BarContainer object of 10 artists>)
In [44]:
plt.scatter(np.nanmean(shhs_2_ess.values-shhs_1_ess.values, axis = 1), np.nanstd(shhs_2_ess.values-shhs_1_ess.values, axis = 1))
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: RuntimeWarning:

Mean of empty slice

Out[44]:
<matplotlib.collections.PathCollection at 0x17c6de19b38>

érdekes, random számokkal nem jön ki a szökőkút

In [73]:
plt.scatter(np.nanmean(np.random.randint(low=-3, high=3, size=(50000,)).reshape(5000,10), axis = 1), np.nanstd(np.random.randint(low=-3, high=3, size=(50000,)).reshape(5000,10), axis = 1))
Out[73]:
<matplotlib.collections.PathCollection at 0x17c09f0f8d0>

Normál eloszlást használva sincs

In [90]:
plt.scatter(np.nanmean((np.random.normal(0, 1, 50000).astype(int)).reshape(5000,10), axis = 1), np.nanstd((np.random.normal(0, 1, 50000).astype(int)).reshape(5000,10), axis = 1))
Out[90]:
<matplotlib.collections.PathCollection at 0x17c0a071780>
In [46]:
no = (shhs_2_ess.values-shhs_1_ess.values)>0
csokken = (shhs_2_ess.values-shhs_1_ess.values)<0
np.sum(no, axis = 1)
szampar = pd.DataFrame({'no':np.sum(no, axis = 1), 'csokken':np.sum(csokken, axis = 1)})
fig = px.density_heatmap(szampar, x = "no", y = "csokken")
fig.show()
In [47]:
no = (shhs_2_ess.values-shhs_interim_ess.values)>0
csokken = (shhs_2_ess.values-shhs_interim_ess.values)<0
np.sum(no, axis = 1)
szampar = pd.DataFrame({'no':np.sum(no, axis = 1), 'csokken':np.sum(csokken, axis = 1)})
fig = px.density_heatmap(szampar, x = "no", y = "csokken")
fig.show()

Sankey-diagram

Amerikai besorolás

In [48]:
import plotly.graph_objects as go
import ipywidgets as widgets
import pandas as pd
import numpy as np


plot = shhs.copy()

#shhs.ess_s2, shhs.ess_interim

plot["ess_s1_cat"] = pd.cut((plot['ess_s1']),bins=[0,5,10,12,15,24],
                            labels=["Low normal", "Higher normal", "Mild","Moderate","Severe"])
plot["ess_interim_cat"] = pd.cut((plot['ess_interim']), bins=[0,5,10,12,15,24],
                            labels=["Low normal", "Higher normal", "Mild","Moderate","Severe"])
plot["ess_s2_cat"] = pd.cut((plot['ess_s2']),bins=[0,5,10,12,15,24],
                            labels=["Low normal", "Higher normal", "Mild","Moderate","Severe"])


#plot = plot.dropna(subset=['NECK20_cat'])
#plot = plot.dropna(subset=['Dias_cat'])

# Build parcats dimensions
categorical_dimensions = ['ess_s1_cat', 'ess_interim_cat', 'ess_s2_cat'] #['Elhizas', 'event', 'ahi']

dimensions = [dict(values=plot[label], label=label) for label in categorical_dimensions]

# Build colorscale
color = np.zeros(len(plot), dtype='uint8')
colorscale = [[0, 'gray'], [0.2, 'gray'],
              [0.2, 'firebrick'], [0.4, 'firebrick'],
              [0.4, 'Blue'], [0.6, 'Blue'],
              [0.6, 'Green'], [0.8, 'Green'],
              [0.8, 'Yellow'], [1.0, 'Yellow']              
             ]
cmin = -0.5
cmax = 4.5

# Build figure as FigureWidget
fig = go.FigureWidget(
    data=[go.Scatter(x=plot.ess_s1, y=plot.ess_s2,
                marker={'color': color, 'cmin': cmin, 'cmax': cmax,'opacity': 0.5,
                        'colorscale': colorscale, 'showscale': True,
                        'colorbar': {'tickvals': [0, 1, 2, 3, 4], 'ticktext': ['None', 'Red', 'Blue', 'Green', 'Yellow']}},
                     mode='markers'),

          go.Parcats(domain={'y': [0, 0.4]}, dimensions=dimensions,
                   line={'colorscale': colorscale, 'cmin': cmin,
                   'cmax': cmax, 'color': color, 'shape': 'hspline'})#,
          #go.Parcats(domain={'y': [0.2, 0.4]}, dimensions=dimensions,
          #         line={'colorscale': colorscale, 'cmin': cmin,
          #         'cmax': cmax, 'color': color, 'shape': 'hspline'})
          
          ]
)

fig.update_layout(height=600, xaxis={'title': 'Umap 1'},
                  yaxis={'title': 'Umap 2', 'domain': [0.6, 1]},
                  dragmode='lasso', hovermode='closest')
#fig.update_traces(showscale=False)
# Build color selection widget
color_toggle = widgets.ToggleButtons(
    options=['None', 'Red', 'Blue', 'Green', 'Yellow'],
    index=1, description='Brush Color:', disabled=False)

# Update color callback
def update_color(trace, points, state):
    # Compute new color array
    new_color = np.array(fig.data[0].marker.color)
    new_color[points.point_inds] = color_toggle.index

    with fig.batch_update():
        # Update scatter color
        fig.data[0].marker.color = new_color

        # Update parcats colors
        fig.data[1].line.color = new_color

# Register callback on scatter selection...
fig.data[0].on_selection(update_color)
# and parcats click
fig.data[1].on_click(update_color)

# Display figure
widgets.VBox([color_toggle, fig])

Magyar besorolás

In [50]:
import plotly.graph_objects as go
import ipywidgets as widgets
import pandas as pd
import numpy as np


plot = shhs.copy()

#shhs.ess_s2, shhs.ess_interim

plot["ess_s1_cat"] = pd.cut((plot['ess_s1']),bins=[0,8,10,14,24],
                            labels=["Nincs", "enyhe", "közepes","magas"])
plot["ess_interim_cat"] = pd.cut((plot['ess_interim']), bins=[0,8,10,14,24],
                            labels=["Nincs", "enyhe", "közepes","magas"])
plot["ess_s2_cat"] = pd.cut((plot['ess_s2']),bins=[0,8,10,14,24],
                            labels=["Nincs", "enyhe", "közepes","magas"])


#plot = plot.dropna(subset=['NECK20_cat'])
#plot = plot.dropna(subset=['Dias_cat'])

# Build parcats dimensions
categorical_dimensions = ['ess_s1_cat', 'ess_interim_cat', 'ess_s2_cat'] #['Elhizas', 'event', 'ahi']

dimensions = [dict(values=plot[label], label=label) for label in categorical_dimensions]

# Build colorscale
color = np.zeros(len(plot), dtype='uint8')
colorscale = [[0, 'gray'], [0.2, 'gray'],
              [0.2, 'firebrick'], [0.4, 'firebrick'],
              [0.4, 'Blue'], [0.6, 'Blue'],
              [0.6, 'Green'], [0.8, 'Green'],
              [0.8, 'Yellow'], [1.0, 'Yellow']              
             ]
cmin = -0.5
cmax = 4.5

# Build figure as FigureWidget
fig = go.FigureWidget(
    data=[go.Scatter(x=plot.ess_s1, y=plot.ess_s2,
                marker={'color': color, 'cmin': cmin, 'cmax': cmax,'opacity': 0.5,
                        'colorscale': colorscale, 'showscale': True,
                        'colorbar': {'tickvals': [0, 1, 2, 3, 4], 'ticktext': ['None', 'Red', 'Blue', 'Green', 'Yellow']}},
                     mode='markers'),

          go.Parcats(domain={'y': [0, 0.4]}, dimensions=dimensions,
                   line={'colorscale': colorscale, 'cmin': cmin,
                   'cmax': cmax, 'color': color, 'shape': 'hspline'})#,
          #go.Parcats(domain={'y': [0.2, 0.4]}, dimensions=dimensions,
          #         line={'colorscale': colorscale, 'cmin': cmin,
          #         'cmax': cmax, 'color': color, 'shape': 'hspline'})
          
          ]
)

fig.update_layout(height=600, xaxis={'title': 'Umap 1'},
                  yaxis={'title': 'Umap 2', 'domain': [0.6, 1]},
                  dragmode='lasso', hovermode='closest')
#fig.update_traces(showscale=False)
# Build color selection widget
color_toggle = widgets.ToggleButtons(
    options=['None', 'Red', 'Blue', 'Green', 'Yellow'],
    index=1, description='Brush Color:', disabled=False)

# Update color callback
def update_color(trace, points, state):
    # Compute new color array
    new_color = np.array(fig.data[0].marker.color)
    new_color[points.point_inds] = color_toggle.index

    with fig.batch_update():
        # Update scatter color
        fig.data[0].marker.color = new_color

        # Update parcats colors
        fig.data[1].line.color = new_color

# Register callback on scatter selection...
fig.data[0].on_selection(update_color)
# and parcats click
fig.data[1].on_click(update_color)

# Display figure
widgets.VBox([color_toggle, fig])
In [66]:
shhs["ess_s1_cat"] = pd.cut((shhs['ess_s1']),bins=[0,10,14,24],
                            labels=["0", "1", "2"])
shhs["ess_interim_cat"] = pd.cut((shhs['ess_interim']), bins=[0,10,14,24],
                            labels=["0", "1", "2"])
shhs["ess_s2_cat"] = pd.cut((shhs['ess_s2']),bins=[0,10,14,24],
                            labels=["0", "1", "2"])

plt.scatter(x = np.nanmean(shhs_2_ess.values-shhs_1_ess.values, axis = 1), 
            y = np.nanstd(shhs_2_ess.values-shhs_1_ess.values, axis = 1),
            c = np.nan_to_num(shhs["ess_s2_cat"].fillna('0').astype(int)-shhs["ess_s1_cat"].fillna('0').astype(int)))
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:8: RuntimeWarning:

Mean of empty slice

C:\ProgramData\Anaconda3\lib\site-packages\numpy\lib\nanfunctions.py:1665: RuntimeWarning:

Degrees of freedom <= 0 for slice.

Out[66]:
<matplotlib.collections.PathCollection at 0x17c09e39b00>
In [63]:
shhs.ess_s2_cat.fillna('0').astype(int)
Out[63]:
0       0
1       0
2       1
3       2
4       0
       ..
4075    2
4076    2
4077    0
4078    0
4079    3
Name: ess_s2_cat, Length: 4080, dtype: int32